Libraries

library(arm)
library(car)
library(here)
library(tidyverse)
library(ggeffects)
library(nlstools)
library(MuMIn)
library(GGally)
library(glmmTMB)
library(ggResidpanel)
library(gridExtra)
library(mgcv)
library(sp)
library(ggmap)
library(lubridate)

Data

raw_counts <- read_csv(here("week_11", "data", "HOT_cyto_counts_edit.csv"))

colnames(raw_counts)[colnames(raw_counts) == "date"] <- "Date"

# Loop through each string
for (i in 1:length(raw_counts$Date)) {
  # Check if the string has exactly 5 digits
  if (nchar(gsub("[^0-9]", "", raw_counts$Date[i])) == 5) {
    raw_counts$Date[i] <- paste0("0", raw_counts$Date[i])
  }
}

raw_counts$Date<-as.character(raw_counts$Date)
raw_counts$Date<-mdy(raw_counts$Date)
raw_mlds <- read_csv(here("week_11", "data", "hot_mlds.csv"))
#clean data 
clean <- raw_counts %>%
  select(Date,press,chl,hbact,pro,picoeuk,crn=cruise)%>%
  full_join(raw_mlds, by= join_by(crn))%>%
  mutate(yd = yday(Date))

1.

ggpairs(clean, columns = c('pro','hbact', 'picoeuk', 'press', 'yd'), lower = list(continuous = wrap("points", alpha = 0.3, bg = 'blue', pch = 21)))

hist(clean$pro)

hist(clean$hbact)

hist(sqrt(clean$picoeuk))

This looks overdispersed to me. I am going to try doing the Tweedy Distribution because I am curious about it for my own research and BY GOD I am going to shoehorn it into one of these homeworks I guess.

For each of the three groups of microbes, fit a 2D smoother that characterizes how abundance changes with depth and with day of the year (i.e., from day 1 to day 365 or 366).

# GAM:
prom = gam(pro ~ s(yd,press),family=tw(), data= clean)
hbacm= gam(hbact ~ s(yd,press),family =tw(), data= clean)
picm=gam(sqrt(picoeuk) ~ s(yd,press), data= clean)

gam.check(prom)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-1.529346e-05,1.409724e-05]
## (score 199.6047 & scale 0.1090976).
## Hessian positive definite, eigenvalue range [12.31667,731.8897].
## Model rank =  30 / 30 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##               k'  edf k-index p-value
## s(yd,press) 29.0 27.6    1.06       1
gam.check(hbacm)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 7 iterations.
## Gradient range [-1.868078e-08,3.925217e-05]
## (score 1176.206 & scale 0.08683606).
## Hessian positive definite, eigenvalue range [0.531579,623.4795].
## Model rank =  30 / 30 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##               k'  edf k-index p-value
## s(yd,press) 29.0 24.2    0.97    0.17
gam.check(picm)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 5 iterations.
## The RMS GCV score gradient at convergence was 6.498151e-07 .
## The Hessian was positive definite.
## Model rank =  30 / 30 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##               k'  edf k-index p-value
## s(yd,press) 29.0 24.3    1.03    0.85

Simulate some Tweedie distribution data to see what the residuals look like:

set.seed(3)
n<-400
## Simulate data...
dat <- gamSim(1,n=n,dist="poisson",scale=.2)
## Gu & Wahba 4 term additive model
dat$y <- rTweedie(exp(dat$f),p=1.3,phi=.5) ## Tweedie response

## Fit a fixed p Tweedie, with wrong link ...
b <- gam(y~s(x0)+s(x1)+s(x2)+s(x3),family=Tweedie(1.25,power(.1)),
         data=dat)
gam.check(b)

## 
## Method: GCV   Optimizer: outer newton
## full convergence after 9 iterations.
## Gradient range [-2.394406e-07,2.353204e-09]
## (score 0.5404011 & scale 0.4951061).
## Hessian positive definite, eigenvalue range [2.394069e-07,0.003074717].
## Model rank =  37 / 37 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##         k'  edf k-index p-value
## s(x0) 9.00 2.71    1.06    0.87
## s(x1) 9.00 2.66    0.96    0.23
## s(x2) 9.00 7.36    0.99    0.48
## s(x3) 9.00 1.00    1.02    0.69

When fitting the 2D smoother for each type of microbe, consider how the response variable should be modeled (transformed or not, normal or non-normal).

Ugh, I have been stuck on this for a long time. Based on the best residuals I could get and what I know about distributions at this moment, I went with tweedie for Pro and Hbac and just a square root transform for Picoeuk. I think especially Pro and Picoeuk can be better fit, but I should finish this and maybe figure out what I am missing along the way.

Consider whether the basis dimension needs to be increased beyond the default value. I don’t think it does? All of the k’ values are close to 1 and the p-values are high. They might be close to the edf though but I am not sure how much counts as “too close”

Plot the fitted smoother in a way that is visually appealing.

plot(prom, select = 1, lwd = 2, scheme = 2, rug = T, se = F, main = "Prochlorococcus concentration")

plot(hbacm, select = 1, lwd = 2, scheme = 2, rug = T, se = F, main = "heterotrophic bacteria concentration")

plot(picm, select = 1, lwd = 2, scheme = 2, rug = T, se = F, main = "photosynthetic picoeukaryote concentration")

Finally, figure out how to test whether the relationship between abundance and depth changes over time or not.

prom2 = gam(pro ~ s(yd)+ s(press) + s(yd, by= press),family=tw(), data= clean)
summary(prom2)
## 
## Family: Tweedie(p=1.429) 
## Link function: log 
## 
## Formula:
## pro ~ s(yd) + s(press) + s(yd, by = press)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)        0          0     NaN      NaN
## 
## Approximate significance of smooth terms:
##               edf Ref.df       F p-value    
## s(yd)       4.081  4.978   13.62  <2e-16 ***
## s(press)    6.560  7.125 1210.59  <2e-16 ***
## s(yd):press 7.044  8.127   69.56  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 28/29
## R-sq.(adj) =  0.835   Deviance explained = 87.1%
## -REML = 200.66  Scale est. = 0.11221   n = 1217
hbacm2= gam(hbact ~ s(yd)+s(press)+ s(yd, by= press),family =tw(), data= clean)
summary(hbacm2)
## 
## Family: Tweedie(p=1.138) 
## Link function: log 
## 
## Formula:
## hbact ~ s(yd) + s(press) + s(yd, by = press)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)        0          0     NaN      NaN
## 
## Approximate significance of smooth terms:
##               edf Ref.df        F p-value    
## s(yd)       6.394  7.413    18.47  <2e-16 ***
## s(press)    5.400  6.308  7105.77  <2e-16 ***
## s(yd):press 6.354  7.375 10016.41  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 28/29
## R-sq.(adj) =  0.731   Deviance explained = 74.8%
## -REML = 1182.5  Scale est. = 0.085693  n = 1217
picm2=gam(sqrt(picoeuk) ~ s(yd) + s(press) + s(yd, by= press), data= clean)
summary(picm2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## sqrt(picoeuk) ~ s(yd) + s(press) + s(yd, by = press)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.040324   0.007049   5.721 1.34e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df      F p-value    
## s(yd)       8.408  8.864  17.30  <2e-16 ***
## s(press)    5.964  6.680 132.96  <2e-16 ***
## s(yd):press 4.404  5.201  30.98  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 28/29
## R-sq.(adj) =  0.516   Deviance explained = 52.3%
## GCV = 0.00060129  Scale est. = 0.00059187  n = 1217

What are your interpretations of the results so far? It looks like there’s definitely variation by depth by all three, where fewer guys like to hang out at deeper depths. I am more suspicious of whether they vary by time or not because changing k’ pretty dramatically changed the way the plot was structured, at least. The summaries with the interaction between time and depth did look significant for all of them, although the picoeuk did not have a lot of the deviance explained by the model I made.

#2.

Now let’s compare the niches of the three groups to each other.

Use a GAM including all groups simultaneously to simultaneously test three questions:

data:

long <- clean %>%
  pivot_longer(cols = hbact:picoeuk, 
               names_to = "microbe", #new variables
               values_to = "conc") #name values
hist(long$conc)

(a) Do the different kinds of microbes have different mean abundances? (b) Do the different kinds of microbes have different average depth distributions (i.e., averaging over time)? (c) Do the different kinds of microbes have different average seasonal dynamics (i.e., averaging over depths)?

GAM:

groupm= gam(conc ~ microbe + s(yd)+s(press)+ s(yd,press), family=tw(), data= long)
gam.check(groupm)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-0.0002031961,0.0004822445]
## (score -1188.631 & scale 0.2230164).
## Hessian positive definite, eigenvalue range [0.04063097,2526.054].
## Model rank =  48 / 48 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                k'   edf k-index p-value
## s(yd)        9.00  1.64    1.09       1
## s(press)     9.00  5.87    1.15       1
## s(yd,press) 27.00 12.01    1.07       1
summary(groupm)
## 
## Family: Tweedie(p=1.712) 
## Link function: log 
## 
## Formula:
## conc ~ microbe + s(yd) + s(press) + s(yd, press)
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.30761    0.01122  116.56   <2e-16 ***
## microbepicoeuk -5.99891    0.02872 -208.87   <2e-16 ***
## microbepro     -1.25614    0.01739  -72.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df      F p-value    
## s(yd)        1.644  1.794  0.394   0.732    
## s(press)     5.867  6.663 77.240  <2e-16 ***
## s(yd,press) 12.012 27.000  2.801  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.867   Deviance explained =   91%
## -REML = -1188.6  Scale est. = 0.22302   n = 3651

Now that you have fit models to test questions (a)-(c), make appropriate plots and perform appropriate hypothesis tests.

plot(groupm, select=3, lwd = 2, scheme = 2, rug = T, se = F)

This sucks! I have not gotten the connection between this assignment and the class example enough to make this work yet.

How do you intepret the results? I whiffed these. I am not sure how to easily separate micrboes for plots and I bet I will feel dumb when we go over this homework. Er, it sure looks like everything is important but I know that p values get shrunk when you don’t have k’ at the right level!

3.

Using only data from the top 45 meters, how does the concentration of each group of microbes vary with:

filter <- long%>%
  filter(press<46)

(a) mixed layer depth (b) Chl a concentration?

gm2= gam(conc ~ microbe + s(mean) + s(chl)+ s(mean,chl), family =tw(), data= filter)

Test whether the three types of microbes exhibit different relationships with the two predictors. Use appropriate GAM(s), hypothesis tests, and smoother plots to assess these questions.

plot(gm2, select=3, lwd = 2, scheme = 2, rug = T, se = F)

plot(gm2, select = 1, residuals = T, shade.col = 'yellow', shade = T, col = 'black')

plot(gm2, select = 2, residuals = T, shade.col = 'yellow', shade = T, col = 'black')

summary(gm2)
## 
## Family: Tweedie(p=1.66) 
## Link function: log 
## 
## Formula:
## conc ~ microbe + s(mean) + s(chl) + s(mean, chl)
## 
## Parametric coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.567238   0.007079  221.40   <2e-16 ***
## microbepicoeuk -6.167367   0.021422 -287.89   <2e-16 ***
## microbepro     -0.858184   0.010826  -79.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df      F  p-value    
## s(mean)     1.001  1.001  0.043    0.837    
## s(chl)      2.341  2.935 10.484 1.78e-06 ***
## s(mean,chl) 8.433 27.000  1.487 5.98e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.953   Deviance explained = 98.5%
## -REML = -1100.1  Scale est. = 0.035024  n = 1230

WHIFFED IT. What did I even do here? Give me an F.